# install.packages("tidyverse")
# install.packages("timetk")
library(tidyverse)
library(readr) #read_csv()
library(dplyr)
library(glmnet)
library(caret) # may not need anymore
library(timetk)

1) Load Data

# Load data:
df <- read_csv("./data/walmart/Walmart.csv")
head(df)

Fix Datatypes, Sort Values

# Fixing datatypes:
# any(grepl("/", df$Date))
print(sapply(df, class))
       Store         Date Weekly_Sales Holiday_Flag  Temperature   Fuel_Price          CPI 
   "numeric"  "character"    "numeric"    "numeric"    "numeric"    "numeric"    "numeric" 
Unemployment 
   "numeric" 
df$Date <- gsub("/", "-", df$Date) # replace slashes with dashes, so all dates follow same format
df$Date <- as.Date(df$Date, format = "%d-%m-%Y") # NOTE european-like date: day/month/year
print(sapply(df, class))
       Store         Date Weekly_Sales Holiday_Flag  Temperature   Fuel_Price          CPI 
   "numeric"       "Date"    "numeric"    "numeric"    "numeric"    "numeric"    "numeric" 
Unemployment 
   "numeric" 
# Sort values:
df <- df[order(df$Date, decreasing = FALSE), ] # gets an ordering of rows based on date and then df[ix] to select that order
head(df)
print(dim(df))
[1] 6435    8
table(df$Date) # value_counts

2010-02-05 2010-02-12 2010-02-19 2010-02-26 2010-03-05 2010-03-12 2010-03-19 2010-03-26 
        45         45         45         45         45         45         45         45 
2010-04-02 2010-04-09 2010-04-16 2010-04-23 2010-04-30 2010-05-07 2010-05-14 2010-05-21 
        45         45         45         45         45         45         45         45 
2010-05-28 2010-06-04 2010-06-11 2010-06-18 2010-06-25 2010-07-02 2010-07-09 2010-07-16 
        45         45         45         45         45         45         45         45 
2010-07-23 2010-07-30 2010-08-06 2010-08-13 2010-08-20 2010-08-27 2010-09-03 2010-09-10 
        45         45         45         45         45         45         45         45 
2010-09-17 2010-09-24 2010-10-01 2010-10-08 2010-10-15 2010-10-22 2010-10-29 2010-11-05 
        45         45         45         45         45         45         45         45 
2010-11-12 2010-11-19 2010-11-26 2010-12-03 2010-12-10 2010-12-17 2010-12-24 2010-12-31 
        45         45         45         45         45         45         45         45 
2011-01-07 2011-01-14 2011-01-21 2011-01-28 2011-02-04 2011-02-11 2011-02-18 2011-02-25 
        45         45         45         45         45         45         45         45 
2011-03-04 2011-03-11 2011-03-18 2011-03-25 2011-04-01 2011-04-08 2011-04-15 2011-04-22 
        45         45         45         45         45         45         45         45 
2011-04-29 2011-05-06 2011-05-13 2011-05-20 2011-05-27 2011-06-03 2011-06-10 2011-06-17 
        45         45         45         45         45         45         45         45 
2011-06-24 2011-07-01 2011-07-08 2011-07-15 2011-07-22 2011-07-29 2011-08-05 2011-08-12 
        45         45         45         45         45         45         45         45 
2011-08-19 2011-08-26 2011-09-02 2011-09-09 2011-09-16 2011-09-23 2011-09-30 2011-10-07 
        45         45         45         45         45         45         45         45 
2011-10-14 2011-10-21 2011-10-28 2011-11-04 2011-11-11 2011-11-18 2011-11-25 2011-12-02 
        45         45         45         45         45         45         45         45 
2011-12-09 2011-12-16 2011-12-23 2011-12-30 2012-01-06 2012-01-13 2012-01-20 2012-01-27 
        45         45         45         45         45         45         45         45 
2012-02-03 2012-02-10 2012-02-17 2012-02-24 2012-03-02 2012-03-09 2012-03-16 2012-03-23 
        45         45         45         45         45         45         45         45 
2012-03-30 2012-04-06 2012-04-13 2012-04-20 2012-04-27 2012-05-04 2012-05-11 2012-05-18 
        45         45         45         45         45         45         45         45 
2012-05-25 2012-06-01 2012-06-08 2012-06-15 2012-06-22 2012-06-29 2012-07-06 2012-07-13 
        45         45         45         45         45         45         45         45 
2012-07-20 2012-07-27 2012-08-03 2012-08-10 2012-08-17 2012-08-24 2012-08-31 2012-09-07 
        45         45         45         45         45         45         45         45 
2012-09-14 2012-09-21 2012-09-28 2012-10-05 2012-10-12 2012-10-19 2012-10-26 
        45         45         45         45         45         45         45 

There are 45 stores, so this makes sense, that there are 45 entries for each date. Also, each date represents one week.

Normalize weekly sales within each store, so that no one store dominates, and weekly sales are comparable: i.e. (sales_store - mean(sales_store)) / std(sales_store) and can convert back later via (scaled_sales * store_std) + store_mean

df <- df %>%
  group_by(Store) %>%
  mutate(
    sales_mean = mean(Weekly_Sales, na.rm = TRUE),
    sales_std   = sd(Weekly_Sales, na.rm = TRUE),
    weekly_sales_scaled = (Weekly_Sales - sales_mean) / sales_std
  ) %>%
  ungroup()
head(df)
for (store_id in 1:length(unique(df$Store))) {
  p <- df %>%
    filter(Store == store_id) %>%
    ggplot(aes(x = Date, y = weekly_sales_scaled)) +
    geom_line(color = "steelblue") +
    labs(title = paste("Weekly Sales Over Time - Store", store_id),
         x = "Date",
         y = "Weekly Sales (scaled per store)")
  print(p)
  }

NA
NA

2) Train/Test split on sorted data:

# Can't randomize the data first, because it is time-series
split_ix <- floor(.8*nrow(df))
train_df <- df[1:split_ix,]
test_df <- df[(split_ix+1):nrow(df),]

print(dim(train_df))
[1] 5148   11
print(dim(test_df))
[1] 1287   11
cat("train:", format(min(train_df$Date), "%Y-%m-%d"), " to ", format(max(train_df$Date), "%Y-%m-%d"),"\n")
train: 2010-02-05  to  2012-04-13 
cat("test:", format(min(test_df$Date), "%Y-%m-%d"), " to ", format(max(test_df$Date), "%Y-%m-%d"),"\n")
test: 2012-04-13  to  2012-10-26 
print(colnames(df))
 [1] "Store"               "Date"                "Weekly_Sales"       
 [4] "Holiday_Flag"        "Temperature"         "Fuel_Price"         
 [7] "CPI"                 "Unemployment"        "sales_mean"         
[10] "sales_std"           "weekly_sales_scaled"
# model.matrix creates a design matrix, auto-encodes categorical as dummy vars (if any--in this case, holiday is already a double.)
# X_train <- model.matrix(weekly_sales_scaled ~ . - Store - Date - Weekly_Sales - sales_mean - sales_std + 0, data=train_df) # + 0 means no intercept
X_test <- model.matrix(weekly_sales_scaled ~ . - Store - Date - Weekly_Sales - sales_mean - sales_std + 0, data=test_df)
# y_train <- train_df$weekly_sales_scaled
y_test <- test_df$weekly_sales_scaled
# NOTE: i just commented out X_train because realized cv function requires dataframe.

print(dim(X_test))
[1] 1287    5
#print(dim(X_train))
#print(X_train[1:10,])
print(X_test[1:10,])
   Holiday_Flag Temperature Fuel_Price      CPI Unemployment
1             0       44.42      4.187 137.8680        8.150
2             0       45.68      4.044 214.3127        7.139
3             0       69.03      3.891 221.1484        6.891
4             0       49.89      4.025 141.8434        7.671
5             0       41.81      4.025 137.8680        4.125
6             0       48.65      4.187 137.8680        8.983
7             0       42.46      4.044 214.3127        7.139
8             0       36.90      4.025 137.8680        7.489
9             0       52.22      4.187 141.8434        8.253
10            0       64.28      4.254 131.1080       11.627

3) LASSO with Rolling Time-series cross validation

Because, it is important to keep the future in the test, or else it could cheat and accidentally learn the past and use it to predict the past, some inherent pattern. i.e. could use info from the future to predict the past (leakage).

cv_folds <- time_series_cv(
  data = train_df,
  date_var = Date,
  cumulative = FALSE, # no growing window (same size folds)
  initial = "6 months", # length of train window
  assess = "3 months", # length of validation window
  skip = "3 months", # jump between folds
  slice_limit = 5 # num folds max
)

plot_time_series_cv_plan(cv_folds, .date_var = Date, .value = weekly_sales_scaled)

Old code—-

Set up cross validation folds, for time-series.

n_train <- nrow(train_df)
folds <- createTimeSlices(1:n_train, 
                          initialWindow = floor(0.7*n_train), # 70% is train data
                          horizon = floor(0.1*n), # 10% is val data 
                          fixedWindow = TRUE) # do not grow the time slice windows with time--do rolling. 

str(folds$train[1:3])

4) LASSO - baseline, NO lagged variables.

LS0tDQp0aXRsZTogIkxhc3NvIC0gV2FsbWFydCINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KLS0tDQoNCmBgYHtyfQ0KIyBpbnN0YWxsLnBhY2thZ2VzKCJ0aWR5dmVyc2UiKQ0KIyBpbnN0YWxsLnBhY2thZ2VzKCJ0aW1ldGsiKQ0KbGlicmFyeSh0aWR5dmVyc2UpDQpsaWJyYXJ5KHJlYWRyKSAjcmVhZF9jc3YoKQ0KbGlicmFyeShkcGx5cikNCmxpYnJhcnkoZ2xtbmV0KQ0KbGlicmFyeShjYXJldCkgIyBtYXkgbm90IG5lZWQgYW55bW9yZQ0KbGlicmFyeSh0aW1ldGspDQpgYGANCg0KDQojIyMgMSkgTG9hZCBEYXRhDQpgYGB7cn0NCiMgTG9hZCBkYXRhOg0KZGYgPC0gcmVhZF9jc3YoIi4vZGF0YS93YWxtYXJ0L1dhbG1hcnQuY3N2IikNCmhlYWQoZGYpDQpgYGANCg0KKipGaXggRGF0YXR5cGVzLCBTb3J0IFZhbHVlcyoqDQpgYGB7cn0NCiMgRml4aW5nIGRhdGF0eXBlczoNCiMgYW55KGdyZXBsKCIvIiwgZGYkRGF0ZSkpDQpwcmludChzYXBwbHkoZGYsIGNsYXNzKSkNCmRmJERhdGUgPC0gZ3N1YigiLyIsICItIiwgZGYkRGF0ZSkgIyByZXBsYWNlIHNsYXNoZXMgd2l0aCBkYXNoZXMsIHNvIGFsbCBkYXRlcyBmb2xsb3cgc2FtZSBmb3JtYXQNCmRmJERhdGUgPC0gYXMuRGF0ZShkZiREYXRlLCBmb3JtYXQgPSAiJWQtJW0tJVkiKSAjIE5PVEUgZXVyb3BlYW4tbGlrZSBkYXRlOiBkYXkvbW9udGgveWVhcg0KcHJpbnQoc2FwcGx5KGRmLCBjbGFzcykpDQoNCiMgU29ydCB2YWx1ZXM6DQpkZiA8LSBkZltvcmRlcihkZiREYXRlLCBkZWNyZWFzaW5nID0gRkFMU0UpLCBdICMgZ2V0cyBhbiBvcmRlcmluZyBvZiByb3dzIGJhc2VkIG9uIGRhdGUgYW5kIHRoZW4gZGZbaXhdIHRvIHNlbGVjdCB0aGF0IG9yZGVyDQpoZWFkKGRmKQ0KYGBgDQpgYGB7cn0NCnByaW50KGRpbShkZikpDQp0YWJsZShkZiREYXRlKSAjIHZhbHVlX2NvdW50cw0KYGBgDQoNCioqVGhlcmUgYXJlIDQ1IHN0b3Jlcywgc28gdGhpcyBtYWtlcyBzZW5zZSwgdGhhdCB0aGVyZSBhcmUgNDUgZW50cmllcyBmb3IgZWFjaCBkYXRlLiBBbHNvLCBlYWNoIGRhdGUgcmVwcmVzZW50cyBvbmUgd2Vlay4qKg0KDQoNCioqTm9ybWFsaXplIHdlZWtseSBzYWxlcyB3aXRoaW4gZWFjaCBzdG9yZSwgc28gdGhhdCBubyBvbmUgc3RvcmUgZG9taW5hdGVzLCBhbmQgd2Vla2x5IHNhbGVzIGFyZSBjb21wYXJhYmxlOioqDQppLmUuIChzYWxlc19zdG9yZSAtIG1lYW4oc2FsZXNfc3RvcmUpKSAvIHN0ZChzYWxlc19zdG9yZSkNCmFuZCBjYW4gY29udmVydCBiYWNrIGxhdGVyIHZpYSAoc2NhbGVkX3NhbGVzICogc3RvcmVfc3RkKSArIHN0b3JlX21lYW4NCmBgYHtyfQ0KZGYgPC0gZGYgJT4lDQogIGdyb3VwX2J5KFN0b3JlKSAlPiUNCiAgbXV0YXRlKA0KICAgIHNhbGVzX21lYW4gPSBtZWFuKFdlZWtseV9TYWxlcywgbmEucm0gPSBUUlVFKSwNCiAgICBzYWxlc19zdGQgICA9IHNkKFdlZWtseV9TYWxlcywgbmEucm0gPSBUUlVFKSwNCiAgICB3ZWVrbHlfc2FsZXNfc2NhbGVkID0gKFdlZWtseV9TYWxlcyAtIHNhbGVzX21lYW4pIC8gc2FsZXNfc3RkDQogICkgJT4lDQogIHVuZ3JvdXAoKQ0KaGVhZChkZikNCmBgYA0KDQpgYGB7cn0NCmZvciAoc3RvcmVfaWQgaW4gMTpsZW5ndGgodW5pcXVlKGRmJFN0b3JlKSkpIHsNCiAgcCA8LSBkZiAlPiUNCiAgICBmaWx0ZXIoU3RvcmUgPT0gc3RvcmVfaWQpICU+JQ0KICAgIGdncGxvdChhZXMoeCA9IERhdGUsIHkgPSB3ZWVrbHlfc2FsZXNfc2NhbGVkKSkgKw0KICAgIGdlb21fbGluZShjb2xvciA9ICJzdGVlbGJsdWUiKSArDQogICAgbGFicyh0aXRsZSA9IHBhc3RlKCJXZWVrbHkgU2FsZXMgT3ZlciBUaW1lIC0gU3RvcmUiLCBzdG9yZV9pZCksDQogICAgICAgICB4ID0gIkRhdGUiLA0KICAgICAgICAgeSA9ICJXZWVrbHkgU2FsZXMgKHNjYWxlZCBwZXIgc3RvcmUpIikNCiAgcHJpbnQocCkNCiAgfQ0KDQoNCmBgYA0KDQojIyMgMikgVHJhaW4vVGVzdCBzcGxpdCBvbiBzb3J0ZWQgZGF0YToNCmBgYHtyfQ0KIyBDYW4ndCByYW5kb21pemUgdGhlIGRhdGEgZmlyc3QsIGJlY2F1c2UgaXQgaXMgdGltZS1zZXJpZXMNCnNwbGl0X2l4IDwtIGZsb29yKC44Km5yb3coZGYpKQ0KdHJhaW5fZGYgPC0gZGZbMTpzcGxpdF9peCxdDQp0ZXN0X2RmIDwtIGRmWyhzcGxpdF9peCsxKTpucm93KGRmKSxdDQoNCnByaW50KGRpbSh0cmFpbl9kZikpDQpwcmludChkaW0odGVzdF9kZikpDQpjYXQoInRyYWluOiIsIGZvcm1hdChtaW4odHJhaW5fZGYkRGF0ZSksICIlWS0lbS0lZCIpLCAiIHRvICIsIGZvcm1hdChtYXgodHJhaW5fZGYkRGF0ZSksICIlWS0lbS0lZCIpLCJcbiIpDQpjYXQoInRlc3Q6IiwgZm9ybWF0KG1pbih0ZXN0X2RmJERhdGUpLCAiJVktJW0tJWQiKSwgIiB0byAiLCBmb3JtYXQobWF4KHRlc3RfZGYkRGF0ZSksICIlWS0lbS0lZCIpLCJcbiIpDQoNCnByaW50KGNvbG5hbWVzKGRmKSkNCiMgbW9kZWwubWF0cml4IGNyZWF0ZXMgYSBkZXNpZ24gbWF0cml4LCBhdXRvLWVuY29kZXMgY2F0ZWdvcmljYWwgYXMgZHVtbXkgdmFycyAoaWYgYW55LS1pbiB0aGlzIGNhc2UsIGhvbGlkYXkgaXMgYWxyZWFkeSBhIGRvdWJsZS4pDQojIFhfdHJhaW4gPC0gbW9kZWwubWF0cml4KHdlZWtseV9zYWxlc19zY2FsZWQgfiAuIC0gU3RvcmUgLSBEYXRlIC0gV2Vla2x5X1NhbGVzIC0gc2FsZXNfbWVhbiAtIHNhbGVzX3N0ZCArIDAsIGRhdGE9dHJhaW5fZGYpICMgKyAwIG1lYW5zIG5vIGludGVyY2VwdA0KWF90ZXN0IDwtIG1vZGVsLm1hdHJpeCh3ZWVrbHlfc2FsZXNfc2NhbGVkIH4gLiAtIFN0b3JlIC0gRGF0ZSAtIFdlZWtseV9TYWxlcyAtIHNhbGVzX21lYW4gLSBzYWxlc19zdGQgKyAwLCBkYXRhPXRlc3RfZGYpDQojIHlfdHJhaW4gPC0gdHJhaW5fZGYkd2Vla2x5X3NhbGVzX3NjYWxlZA0KeV90ZXN0IDwtIHRlc3RfZGYkd2Vla2x5X3NhbGVzX3NjYWxlZA0KIyBOT1RFOiBpIGp1c3QgY29tbWVudGVkIG91dCBYX3RyYWluIGJlY2F1c2UgcmVhbGl6ZWQgY3YgZnVuY3Rpb24gcmVxdWlyZXMgZGF0YWZyYW1lLg0KDQpwcmludChkaW0oWF90ZXN0KSkNCiNwcmludChkaW0oWF90cmFpbikpDQojcHJpbnQoWF90cmFpblsxOjEwLF0pDQpwcmludChYX3Rlc3RbMToxMCxdKQ0KYGBgDQoNCiMjIyAzKSBMQVNTTyB3aXRoIFJvbGxpbmcgVGltZS1zZXJpZXMgY3Jvc3MgdmFsaWRhdGlvbg0KQmVjYXVzZSwgaXQgaXMgaW1wb3J0YW50IHRvIGtlZXAgdGhlIGZ1dHVyZSBpbiB0aGUgdGVzdCwgb3IgZWxzZSBpdCBjb3VsZCBjaGVhdCBhbmQgDQphY2NpZGVudGFsbHkgbGVhcm4gdGhlIHBhc3QgYW5kIHVzZSBpdCB0byBwcmVkaWN0IHRoZSBwYXN0LCBzb21lIGluaGVyZW50IHBhdHRlcm4uDQppLmUuIGNvdWxkIHVzZSBpbmZvIGZyb20gdGhlIGZ1dHVyZSB0byBwcmVkaWN0IHRoZSBwYXN0IChsZWFrYWdlKS4NCg0KYGBge3J9DQpjdl9mb2xkcyA8LSB0aW1lX3Nlcmllc19jdigNCiAgZGF0YSA9IHRyYWluX2RmLA0KICBkYXRlX3ZhciA9IERhdGUsDQogIGN1bXVsYXRpdmUgPSBGQUxTRSwgIyBubyBncm93aW5nIHdpbmRvdyAoc2FtZSBzaXplIGZvbGRzKQ0KICBpbml0aWFsID0gIjYgbW9udGhzIiwgIyBsZW5ndGggb2YgdHJhaW4gd2luZG93DQogIGFzc2VzcyA9ICIzIG1vbnRocyIsICMgbGVuZ3RoIG9mIHZhbGlkYXRpb24gd2luZG93DQogIHNraXAgPSAiMyBtb250aHMiLCAjIGp1bXAgYmV0d2VlbiBmb2xkcw0KICBzbGljZV9saW1pdCA9IDUgIyBudW0gZm9sZHMgbWF4DQopDQoNCnBsb3RfdGltZV9zZXJpZXNfY3ZfcGxhbihjdl9mb2xkcywgLmRhdGVfdmFyID0gRGF0ZSwgLnZhbHVlID0gd2Vla2x5X3NhbGVzX3NjYWxlZCkNCmBgYA0KDQoNCg0KDQojIyMgT2xkIGNvZGUtLS0tDQpTZXQgdXAgY3Jvc3MgdmFsaWRhdGlvbiBmb2xkcywgZm9yIHRpbWUtc2VyaWVzLg0KDQpgYGB7cn0NCm5fdHJhaW4gPC0gbnJvdyh0cmFpbl9kZikNCmZvbGRzIDwtIGNyZWF0ZVRpbWVTbGljZXMoMTpuX3RyYWluLCANCiAgICAgICAgICAgICAgICAgICAgICAgICAgaW5pdGlhbFdpbmRvdyA9IGZsb29yKDAuNypuX3RyYWluKSwgIyA3MCUgaXMgdHJhaW4gZGF0YQ0KICAgICAgICAgICAgICAgICAgICAgICAgICBob3Jpem9uID0gZmxvb3IoMC4xKm4pLCAjIDEwJSBpcyB2YWwgZGF0YSANCiAgICAgICAgICAgICAgICAgICAgICAgICAgZml4ZWRXaW5kb3cgPSBUUlVFKSAjIGRvIG5vdCBncm93IHRoZSB0aW1lIHNsaWNlIHdpbmRvd3Mgd2l0aCB0aW1lLS1kbyByb2xsaW5nLiANCg0Kc3RyKGZvbGRzJHRyYWluWzE6M10pDQoNCmBgYA0KDQoNCiMjIyA0KSBMQVNTTyAtIGJhc2VsaW5lLCBOTyBsYWdnZWQgdmFyaWFibGVzLg0KYGBge3J9DQoNCmBgYA0KDQo=